#!/usr/bin/python
#coding:utf-8

#这是一个没有破解反爬虫机制的爬虫实验


import re
import urllib2
from bs4 import BeautifulSoup

def OpenPage(url):
    """Fetch *url* and return its body transcoded from GBK to UTF-8.

    The target site serves GBK-encoded HTML; undecodable bytes are
    dropped ("ignore") and the result is re-encoded as a UTF-8 byte
    string for downstream parsing/writing.
    """
    # Headers left empty on purpose: per the module comment, this
    # experiment deliberately does not try to defeat anti-scraping checks.
    Myheaders = {}

    request = urllib2.Request(url, headers = Myheaders)
    f = urllib2.urlopen(request)
    try:
        data = f.read()
    finally:
        # Close the HTTP response to avoid leaking the socket.
        f.close()

    # BUG FIX 1: "uft-8" was a typo that raised LookupError at runtime;
    # corrected to "utf-8".
    # BUG FIX 2: Python 2 str.decode() rejects keyword arguments
    # ("decode() takes no keyword arguments"), so "ignore" is passed
    # positionally — valid in both Python 2 and 3.
    return data.decode("GBK", "ignore").encode("utf-8")

def ParseMainPage(page):
    """Extract the chapter URL list from the novel's index page.

    Every anchor whose href contains "read" is treated as a chapter
    link; the site root is prepended to build an absolute URL.
    """
    soup = BeautifulSoup(page, "html.parser")

    chapter_anchors = soup.find_all(href = re.compile("read"))

    chapter_urls = []
    for anchor in chapter_anchors:
        chapter_urls.append("http://www.shengxu6.com" + anchor["href"])

    return chapter_urls

def ParseDetailPage(page):
    """Pull (title, content) out of a single chapter page.

    Returns a 2-tuple: the chapter heading text and the body text with
    its last 12 characters removed — presumably trailing site
    boilerplate; verify against a live page.
    """
    soup = BeautifulSoup(page, "html.parser")

    heading = soup.find_all(class_ = "panel-heading")[0]
    body = soup.find_all(class_ = "content-body")[0]

    # Trim the trailing 12 characters of the body text.
    return heading.get_text(), body.get_text()[:-12]

def WriteDataToFile(data):
    """Append one chapter's text to output.txt in the working directory."""
    out_file = open("output.txt", "a+")
    try:
        out_file.write(data)
    finally:
        # Mirror the context-manager guarantee: close even on write error.
        out_file.close()

if __name__ == "__main__":
    Get = raw_input("输入要爬取得小说网址：")
    MainPage = OpenPage(Get)
    GetUrl = ParseMainPage(MainPage)
    for item in GetUrl:
        print "Clone" + item
        page = OpenPage(item)
        title, content = ParseDetailPage(page)
        print "Clone title is" + title
        data = "\n\n\n" + title + "\n\n\n" + content
        WriteDataToFile(data.encode("utf-8"))







































